#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Graph-GAP: Reproduce the trial-run scoring datasets.

This script regenerates:
  - units_coder_level.csv
  - units_aggregated.csv
  - requirement_summary.csv
  - reliability_metrics.csv
  - weighted_kappa_pairs.csv
  - external_proxy_signals.csv
  - external_proxy_summary.csv
  - audit_log.csv
  - run_config.json

Copyright note:
  The script reads your local PDFs and computes hashes/anchors.
  It does NOT export full verbatim text by default.
  Use --include_debug_excerpts to export short excerpts for debugging (<=12 words).

Usage:
  python reproduce.py \
    --unicef_pdf "人工智能与儿童.pdf" \
    --out_dir "out_graphgap" \
    --external_pdfs "NIST_AI_RMF_1_0.pdf" "EU_AI_Act_2024_1689.pdf" "Europol_IOCTA_2024.pdf"

Dependencies:
  pip install pymupdf pandas numpy
"""

import argparse
import datetime
import hashlib
import json
import re
from pathlib import Path

import numpy as np
import pandas as pd
import fitz  # PyMuPDF

# Keyword vocabulary for each requirement (R1..R10), written as one
# whitespace-separated string per requirement.
# NOTE(review): the strings are split on whitespace below, so multi-word
# phrases ("due diligence", "best interests", "all children") are matched as
# independent single words, not as phrases.
# NOTE(review): hyphenated entries ("well-being", "non-discrimination",
# "multi-stakeholder") can never match, because map_to_rk/score_dims tokenize
# text with r"[A-Za-z]+", which never yields a token containing a hyphen.
REQS = {
    "R1": "governance regulation regulatory law policy compliance oversight enforcement regulator government authority",
    "R2": "safety security protect protection abuse exploitation grooming violence harmful content age assurance moderation",
    "R3": "privacy data personal information consent profiling tracking biometric anonymisation confidentiality",
    "R4": "discrimination bias fairness equality equitable non-discrimination marginalized race gender disability",
    "R5": "transparency explainability explainable disclosure accountability audit documentation reporting traceability",
    "R6": "responsible responsibility due diligence human rights ethics lifecycle design development deployment",
    "R7": "wellbeing well-being development best interests health mental psychosocial education flourishing",
    "R8": "inclusion inclusive accessibility accessible disability underserved language minority all children participation",
    "R9": "skills literacy education training capacity building empowerment digital literacy ai literacy teachers parents",
    "R10": "ecosystem collaboration multi-stakeholder partnership research innovation investment infrastructure standards",
}
# Requirement id -> set of single-word keywords (duplicates collapse in the set).
REQ_KW = {k: set(v.split()) for k, v in REQS.items()}

# Evidence vocabulary; feeds the E dimension count in score_dims.
E_TERMS = set(
    "evidence data research study studies report statistics monitor monitoring evaluate evaluation measurement indicator indicators baseline".split()
)
# Mechanism vocabulary (risk / control / causal cues); feeds the M dimension.
M_RISK = set("risk risks harmful harm harms abuse exploitation bias discrimination manipulation coercion".split())
M_CONTROL = set("mitigate prevent reduce ensure safeguard protect detect respond".split())
# NOTE(review): after split() this set holds the single words "to", "in",
# "order", "so" and "that", so any sentence containing e.g. "to" counts as
# causal. Presumably phrases ("in order to", "so that") were intended — confirm.
M_CAUSAL = set("because so therefore leads results causes causing to prevent in order to so that".split())
# Governance vocabulary: actors and process words; feeds the G dimension.
G_ACTOR = set(
    "government regulator regulators authority authorities company companies business businesses provider providers developer developers platform platforms operator operators".split()
)
G_PROC = set(
    "require required ensure accountability accountable oversight enforce enforcement audit auditing complaint complaints remedy redress sanction sanctions".split()
)
# Measurement vocabulary; feeds the K dimension together with FREQ_TERMS.
K_TERMS = set(
    "measure measurement metric metrics indicator indicators monitor monitoring benchmark threshold targets frequency report reporting log logging audit".split()
)
FREQ_TERMS = set("daily weekly monthly quarterly annually yearly".split())
# Tool / standard vocabulary; feeds the Readiness score.
# Phrases such as "impact assessment" are again stored as separate words.
TOOLS = set(
    "impact assessment risk assessment dpia audit log logging transparency report governance framework standard policy procedure".split()
)
STANDARDS = set("nist iso iec oecd gdpr dsa airmf".split())
# Union used by external_proxy to flag a sentence as carrying a "signal".
SIGNAL_TERMS = K_TERMS | set(
    "assess assessment impact risk management governance audit logging complaint redress oversight enforce".split()
) | TOOLS | STANDARDS

def sha256_file(path: str, chunk: int = 1 << 20) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is opened in binary mode and consumed in pieces of at most
    *chunk* bytes, so it is never loaded into memory as a whole.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for piece in iter(lambda: stream.read(chunk), b""):
            digest.update(piece)
    return digest.hexdigest()

def extract_pdf_pages_text(path: str):
    """Extract lightly normalized plain text from every page of a PDF.

    Runs of spaces/tabs are collapsed to one space and runs of three or more
    newlines are reduced to two.

    Args:
        path: Path of the PDF to open with PyMuPDF.

    Returns:
        A list of ``(page_number, text)`` tuples; page numbers start at 1.
    """
    doc = fitz.open(path)
    try:
        pages = []
        for i in range(doc.page_count):
            t = doc.load_page(i).get_text("text") or ""
            t = re.sub(r"[ \t]+", " ", t)
            t = re.sub(r"\n{3,}", "\n\n", t)
            pages.append((i + 1, t))
    finally:
        # Release the document handle even if a page fails to load or parse.
        doc.close()
    return pages

def split_sentences_with_offsets(text: str):
    """Split *text* into sentence-like segments and report their offsets.

    The text is first normalized: bullet characters (U+2022) become ". " and
    runs of spaces/tabs collapse to a single space. Segments end after ".",
    "!" or "?" followed by whitespace; the remainder after the last
    terminator is treated as a final segment.

    A segment is kept only if its stripped form has at least 40 characters
    and does not contain the literal "[PAGE".

    Returns:
        ``(spans, normalized)`` where ``spans`` is a list of
        ``(start, end, stripped_segment)`` tuples whose offsets index into
        ``normalized``.
    """
    normalized = re.sub(r"[ \t]+", " ", text.replace("\u2022", ". "))
    spans = []

    def _keep(lo, hi):
        # Record the slice [lo, hi) when it passes the length/marker filter.
        piece = normalized[lo:hi].strip()
        if len(piece) >= 40 and "[PAGE" not in piece:
            spans.append((lo, hi, piece))

    cursor = 0
    for boundary in re.finditer(r"[\.\!\?]\s+", normalized):
        _keep(cursor, boundary.end())
        cursor = boundary.end()
    # Whatever follows the last terminator is the tail segment.
    _keep(cursor, len(normalized))
    return spans, normalized

def page_from_context(context: str, idx: int):
    """Return the page number of the last "[PAGE n]" marker before *idx*.

    Only ``context[:idx]`` is searched. Returns ``None`` when no marker
    occurs in that prefix.
    """
    last_page = None
    for marker in re.finditer(r"\[PAGE\s+(\d+)\]", context[:idx]):
        last_page = int(marker.group(1))
    return last_page

def map_to_rk(sentence: str):
    """Assign *sentence* to the requirement whose keywords it overlaps most.

    The sentence is lower-cased and tokenized into alphabetic words; each
    requirement in ``REQ_KW`` is scored by the number of distinct keywords
    present. Ties go to the requirement that comes first in ``REQ_KW``.

    Returns:
        The requirement key (e.g. "R3"), or ``None`` if no keyword matches.
    """
    tokens = set(re.findall(r"[A-Za-z]+", sentence.lower()))
    best_key, best_hits = None, 0
    for rk, keywords in REQ_KW.items():
        hits = len(tokens & keywords)
        # Strict ">" keeps the earliest requirement on ties.
        if hits > best_hits:
            best_key, best_hits = rk, hits
    return best_key

def has_number(s: str) -> bool:
    """Return True if *s* contains a standalone integer or decimal number.

    The digits must be delimited by word boundaries, so digits embedded in a
    word (e.g. "a1b") do not count.
    """
    return re.search(r"\b\d+(\.\d+)?\b", s) is not None

def score_dims(sentence: str, coder: str = "A"):
    """Score one sentence on the E, M, G, K dimensions and Readiness.

    Scores are derived from keyword counts over the lower-cased alphabetic
    tokens of *sentence*, plus a bonus when the sentence contains a number.
    The *coder* label selects a threshold profile: "A" and "B" have their own
    thresholds, any other value uses the third profile.

    Returns:
        A tuple ``(E, M, G, K, R)``:
          * E, M, G, K take values in 1..5 (K is never 4; with the third
            profile the two lower K thresholds coincide, so K is never 3).
          * R (Readiness) takes values in {0, 1, 3, 4, 5}.
    """
    lowered = sentence.lower()
    tokens = set(re.findall(r"[A-Za-z]+", lowered))
    numeric = has_number(sentence)

    def hits(vocab):
        # Number of distinct vocabulary words present in the sentence.
        return len(tokens & vocab)

    e_cnt = hits(E_TERMS) + (2 if numeric else 0)
    k_cnt = hits(K_TERMS) + (1 if numeric else 0) + (1 if hits(FREQ_TERMS) > 0 else 0)
    risk = hits(M_RISK) > 0
    harm = not tokens.isdisjoint({"harm", "harmful", "abuse", "exploitation"})
    control = hits(M_CONTROL) > 0
    causal = hits(M_CAUSAL) > 0
    actor = hits(G_ACTOR) > 0
    proc = hits(G_PROC)
    tools = hits(TOOLS) + hits(STANDARDS)

    # Threshold profiles in the order (evidence, KPI, process, tools).
    # Index 0 of each tuple is not consulted.
    profiles = {
        "A": ((0, 1, 3, 5), (0, 1, 2, 4), (0, 1, 2, 4), (0, 1, 2, 4)),
        "B": ((0, 1, 2, 4), (0, 1, 2, 3), (0, 1, 2, 3), (0, 1, 2, 3)),
    }
    fallback = ((0, 1, 2, 3), (0, 1, 1, 3), (0, 1, 2, 3), (0, 1, 2, 3))
    e_thr, k_thr, proc_thr, tool_thr = profiles.get(coder, fallback)

    def tier(count, thr, grades):
        # Test the thresholds from strictest to loosest; None if none is met.
        for bound, grade in zip((thr[3], thr[2], thr[1]), grades):
            if count >= bound:
                return grade
        return None

    # E: more evidence cues -> lower (better) score; a normative sentence
    # without any evidence cue scores worst.
    E = tier(e_cnt, e_thr, (1, 2, 3))
    if E is None:
        E = 5 if re.search(r"\b(should|must|need to|shall)\b", lowered) else 4

    # M: combination of risk, harm, control and causal cues.
    hazard = risk or harm
    if risk and harm and control:
        M = 1
    elif control and hazard and causal:
        M = 2
    elif causal and (hazard or control):
        M = 3
    elif hazard:
        M = 4
    else:
        M = 5

    # G: the process ladder applies only when an actor is named.
    G = tier(proc, proc_thr, (1, 2, 3)) if actor else None
    if G is None:
        G = 4 if (actor or proc > 0) else 5

    # K: measurement cues; no intermediate score of 4 exists.
    K = tier(k_cnt, k_thr, (1, 2, 3))
    if K is None:
        K = 5

    # R: higher means more tool/standard cues.
    R = tier(tools, tool_thr, (5, 4, 3))
    if R is None:
        R = 1 if (actor or control or proc > 0) else 0

    return E, M, G, K, R

def gap_score(E: float, M: float, G: float, K: float) -> float:
    """Return the unweighted mean of the four gap dimensions E, M, G and K."""
    total = E + M + G + K
    return 0.25 * total

def bootstrap_ci(values, n=2000, alpha=0.05, seed=7):
    """Percentile bootstrap confidence interval for the mean of *values*.

    Args:
        values: Sequence of numbers.
        n: Number of bootstrap resamples.
        alpha: Two-sided error level; the interval spans the
            ``alpha/2`` and ``1 - alpha/2`` quantiles of the resampled means.
        seed: Seed for the NumPy random generator, making results repeatable.

    Returns:
        ``(low, high)`` as floats, or ``(nan, nan)`` for empty input.
    """
    rng = np.random.default_rng(seed)
    sample = np.array(values, dtype=float)
    size = len(sample)
    if size == 0:
        return (np.nan, np.nan)
    # One resample (with replacement) per iteration, drawn in sequence so the
    # random stream is consumed in a fixed order.
    means = [rng.choice(sample, size=size, replace=True).mean() for _ in range(n)]
    lower = float(np.quantile(means, alpha / 2))
    upper = float(np.quantile(means, 1 - alpha / 2))
    return lower, upper

def percentile_ci(values, q=0.8, n=2000, alpha=0.05, seed=7):
    """Percentile bootstrap confidence interval for a quantile of *values*.

    Args:
        values: Sequence of numbers.
        q: Quantile of interest (0.8 = 80th percentile).
        n: Number of bootstrap resamples.
        alpha: Two-sided error level; the interval spans the
            ``alpha/2`` and ``1 - alpha/2`` quantiles of the resampled
            statistics.
        seed: Seed for the NumPy random generator, making results repeatable.

    Returns:
        ``(low, high)`` as floats, or ``(nan, nan)`` for empty input
        (mirroring ``bootstrap_ci``).
    """
    rng = np.random.default_rng(seed)
    values = np.array(values, dtype=float)
    # Nothing to resample: return an undefined interval instead of asking
    # NumPy for a quantile of an empty sample.
    if len(values) == 0:
        return (np.nan, np.nan)
    boots = []
    for _ in range(n):
        samp = rng.choice(values, size=len(values), replace=True)
        boots.append(np.quantile(samp, q))
    lo = float(np.quantile(boots, alpha / 2))
    hi = float(np.quantile(boots, 1 - alpha / 2))
    return lo, hi

def krippendorff_alpha_ordinal(data, min_rating=1, max_rating=5):
    """Compute an alpha-style agreement coefficient for a units x raters matrix.

    Observed disagreement is the mean pairwise distance between ratings of
    the same unit (NaN ratings are ignored). Expected disagreement is the
    distance averaged over the pooled category distribution, using the
    product of category shares for every ordered pair of categories.

    NOTE(review): the distance is the squared difference scaled by the rating
    range; confirm this matches the intended "ordinal" metric.

    Args:
        data: 2-D array-like, one row per unit and one column per rater.
        min_rating: Lowest category value.
        max_rating: Highest category value.

    Returns:
        ``1 - Do/De``; ``1.0`` when expected disagreement is zero; ``nan``
        when no unit has two ratings or fewer than two ratings exist overall.
    """
    ratings = np.array(data, dtype=float)
    span = max_rating - min_rating

    def dist(a, b):
        return ((a - b) / span) ** 2

    # Observed disagreement: all unordered rating pairs within each unit.
    observed = 0.0
    pair_count = 0
    for row_idx in range(ratings.shape[0]):
        present = ratings[row_idx, ~np.isnan(ratings[row_idx])]
        if len(present) < 2:
            continue
        for pos, left in enumerate(present):
            for right in present[pos + 1:]:
                observed += dist(left, right)
                pair_count += 1
    if pair_count == 0:
        return np.nan
    observed /= pair_count

    pooled = ratings[~np.isnan(ratings)]
    if len(pooled) < 2:
        return np.nan

    # Expected disagreement from the pooled share of each category.
    categories = np.arange(min_rating, max_rating + 1)
    counts = np.array([(pooled == c).sum() for c in categories], dtype=float)
    share = counts / counts.sum()
    expected = 0.0
    for cat_a, share_a in zip(categories, share):
        for cat_b, share_b in zip(categories, share):
            expected += share_a * share_b * dist(cat_a, cat_b)
    if expected == 0:
        return 1.0
    return 1 - observed / expected

def weighted_kappa_quadratic(a, b, min_rating=1, max_rating=5):
    """Quadratic-weighted kappa between two raters' ratings.

    Pairs in which either rating is NaN are dropped; the remaining ratings
    are cast to int and tallied into a confusion matrix indexed by
    ``rating - min_rating``.

    Args:
        a, b: Equal-length sequences of ratings.
        min_rating: Lowest category value.
        max_rating: Highest category value.

    Returns:
        ``1 - sum(W*O) / sum(W*E)``, or ``nan`` when no valid pair exists or
        the expected weighted disagreement is not positive.
    """
    first = np.array(a, dtype=float)
    second = np.array(b, dtype=float)
    valid = ~(np.isnan(first) | np.isnan(second))
    first = first[valid].astype(int)
    second = second[valid].astype(int)

    levels = max_rating - min_rating + 1
    observed = np.zeros((levels, levels), dtype=float)
    for left, right in zip(first, second):
        observed[left - min_rating, right - min_rating] += 1
    total = observed.sum()
    if total == 0:
        return np.nan
    observed /= total

    # Quadratic disagreement weights, 0 on the diagonal and 1 at the corners.
    weights = np.array(
        [[((i - j) / (levels - 1)) ** 2 for j in range(levels)] for i in range(levels)],
        dtype=float,
    )
    row_share = observed.sum(axis=1)
    col_share = observed.sum(axis=0)
    expected = np.outer(row_share, col_share)
    num = (weights * observed).sum()
    den = (weights * expected).sum()
    if den > 0:
        return 1 - num / den
    return np.nan

def icc_2k(ratings):
    """Compute an ICC(2,k)-style coefficient from a units x raters matrix.

    Rows containing any NaN are removed. Mean squares for units (rows),
    raters (columns) and residual are derived from a two-way decomposition,
    and the result is
    ``(MSR - MSE) / (MSR + (MSC - MSE) / n)`` with ``n`` the number of units.

    Args:
        ratings: 2-D array-like, one row per unit and one column per rater.

    Returns:
        The coefficient, or ``nan`` when fewer than two complete rows remain.
    """
    scores = np.array(ratings, dtype=float)
    complete = ~np.isnan(scores).any(axis=1)
    scores = scores[complete]
    n_units, n_raters = scores.shape
    if n_units < 2:
        return np.nan

    unit_means = scores.mean(axis=1, keepdims=True)
    rater_means = scores.mean(axis=0, keepdims=True)
    overall = scores.mean()

    ss_units = n_raters * ((unit_means - overall) ** 2).sum()
    ss_raters = n_units * ((rater_means - overall) ** 2).sum()
    ss_resid = ((scores - unit_means - rater_means + overall) ** 2).sum()

    ms_units = ss_units / (n_units - 1)
    if n_raters > 1:
        ms_raters = ss_raters / (n_raters - 1)
        ms_resid = ss_resid / ((n_units - 1) * (n_raters - 1))
    else:
        # With a single rater the rater and residual terms are set to zero.
        ms_raters = 0
        ms_resid = 0
    return (ms_units - ms_resid) / (ms_units + (ms_raters - ms_resid) / n_units)

def build_units_from_unicef(unicef_pdf: str):
    """Extract scorable text units from the main PDF.

    Page texts are joined with "[PAGE n]" markers. If the first occurrence of
    at least 8 of the headings "Requirement 1" .. "Requirement 10" is found,
    the text is cut into one block per heading (from the heading's first
    occurrence to the next one) and each block is split into sentences that
    inherit the block's requirement. Otherwise every line of at least 60
    characters is assigned to a requirement by keyword overlap
    (``map_to_rk``); lines without any keyword match are skipped.

    Args:
        unicef_pdf: Path of the PDF.

    Returns:
        ``(units_df, mode)`` where ``units_df`` has the columns unit_id, page,
        requirement, unit_rank, text_sha256, text_len_chars and text, and
        ``mode`` is "heading_blocks" or "sentence_fallback" (the latter label
        denotes the line-based fallback).
    """
    columns = ["unit_id", "page", "requirement", "unit_rank", "text_sha256", "text_len_chars", "text"]
    pages = extract_pdf_pages_text(unicef_pdf)
    joined = "\n\n".join(f"[PAGE {p}]\n{t}" for p, t in pages)

    # Offset of the first mention of each "Requirement N" heading.
    first_after = {}
    for i in range(1, 11):
        m = re.search(rf"\bRequirement\s+{i}\b", joined, flags=re.IGNORECASE)
        if m:
            first_after[i] = m.start()
    use_heading_blocks = len(first_after) >= 8

    unit_rows = []
    unit_idx = 0
    if use_heading_blocks:
        positions = sorted(first_after.items(), key=lambda item: item[1])
        block_ends = [pos for _, pos in positions[1:]] + [len(joined)]
        for (num, pos), next_pos in zip(positions, block_ends):
            block = joined[pos:next_pos]
            # The block starts at the heading, so the marker of the heading's
            # own page lies before the block. Look it up in the full text and
            # use it for units that precede the block's first page marker.
            heading_page = page_from_context(joined, pos)
            spans, norm = split_sentences_with_offsets(block)
            rk = f"R{num}"
            for s0, _s1, seg in spans:
                pno = page_from_context(norm, s0)
                if pno is None:
                    pno = heading_page
                # Defensive: strip any page marker left inside the segment.
                seg = re.sub(r"\[PAGE\s+\d+\]", "", seg).strip()
                if len(seg) < 40:
                    continue
                unit_idx += 1
                uid = f"{rk}-p{(pno or 0):02d}-u{unit_idx:05d}"
                h = hashlib.sha256(seg.encode("utf-8")).hexdigest()
                unit_rows.append({"unit_id": uid, "page": pno, "requirement": rk, "unit_rank": unit_idx, "text_sha256": h, "text_len_chars": len(seg), "text": seg})
    else:
        for pno, txt in pages:
            lines = [ln.strip() for ln in re.split(r"\n+", txt) if len(ln.strip()) >= 60]
            for ln in lines:
                rk = map_to_rk(ln)
                if rk is None:
                    continue
                unit_idx += 1
                uid = f"{rk}-p{pno:02d}-u{unit_idx:05d}"
                h = hashlib.sha256(ln.encode("utf-8")).hexdigest()
                unit_rows.append({"unit_id": uid, "page": pno, "requirement": rk, "unit_rank": unit_idx, "text_sha256": h, "text_len_chars": len(ln), "text": ln})
    # Passing the column list keeps the schema even when no unit was found.
    units_df = pd.DataFrame(unit_rows, columns=columns)
    return units_df, ("heading_blocks" if use_heading_blocks else "sentence_fallback")

def build_scores(units_df: pd.DataFrame):
    """Score every unit once per simulated coder ("A", "B", "C").

    Each output row carries the unit's identifying columns, the coder label,
    the five dimension scores from ``score_dims`` and the resulting GapScore.
    The unit text itself is not copied to the output.

    Returns:
        A DataFrame with three rows per input unit.
    """
    coders = ("A", "B", "C")
    records = []
    for _, unit in units_df.iterrows():
        text = unit["text"]
        for coder in coders:
            E, M, G, K, R = score_dims(text, coder)
            record = {key: unit[key] for key in ("unit_id", "page", "requirement", "unit_rank")}
            record.update(coder=coder, E=E, M=M, G=G, K=K, Readiness=R, GapScore=gap_score(E, M, G, K))
            record["text_sha256"] = unit["text_sha256"]
            record["text_len_chars"] = unit["text_len_chars"]
            records.append(record)
    return pd.DataFrame(records)

def aggregate_units(coder_level: pd.DataFrame):
    """Combine the three coders' scores into one row per unit.

    For each dimension the aggregate is the median of the coders' values,
    unless the spread (max - min) is at least 2; then the conservative value
    is taken (max for E/M/G/K, min for Readiness) and the unit is flagged.
    The aggregated GapScore is recomputed from the aggregated E/M/G/K.

    Args:
        coder_level: Long-format frame as produced by ``build_scores``.

    Returns:
        ``(units, pivot)`` where ``units`` holds the aggregated scores plus
        ``flag_disagreement`` and ``pivot`` is the wide frame with one
        ``<dimension>_<coder>`` column per score.
    """
    meta_cols = ["unit_id", "page", "requirement", "unit_rank", "text_sha256", "text_len_chars"]
    dims = ["E", "M", "G", "K", "Readiness"]

    # Pivot on unit_id only: group keys containing NaN (e.g. a unit without a
    # page number) are dropped by pandas, which would silently lose units if
    # the metadata columns were part of the index.
    wide = coder_level.pivot_table(
        index="unit_id",
        columns="coder",
        values=dims + ["GapScore"],
        aggfunc="first",
    )
    wide.columns = [f"{a}_{b}" for a, b in wide.columns]
    wide = wide.reset_index()
    value_cols = [c for c in wide.columns if c != "unit_id"]

    # Re-attach the per-unit metadata (identical across a unit's coder rows).
    meta = coder_level[meta_cols].drop_duplicates(subset="unit_id")
    pivot = wide.merge(meta, on="unit_id", how="left")[meta_cols + value_cols]

    def agg_row(r, dim):
        vals = np.array([r[f"{dim}_A"], r[f"{dim}_B"], r[f"{dim}_C"]], dtype=float)
        med = float(np.median(vals))
        if (np.nanmax(vals) - np.nanmin(vals)) >= 2:
            # Strong disagreement: take the pessimistic end of the range.
            if dim == "Readiness":
                return float(np.nanmin(vals)), True
            return float(np.nanmax(vals)), True
        return med, False

    out = []
    for _, r in pivot.iterrows():
        row = {k: r[k] for k in meta_cols}
        flagged = False
        for dim in dims:
            v, f = agg_row(r, dim)
            row[dim] = v
            flagged = flagged or f
        row["GapScore"] = gap_score(row["E"], row["M"], row["G"], row["K"])
        row["flag_disagreement"] = flagged
        out.append(row)
    return pd.DataFrame(out), pivot

def summarize_requirements(units: pd.DataFrame, bootstrap_n=2000, seed=7):
    """Summarize aggregated unit scores per requirement.

    For each requirement the summary holds the unit count, the mean of each
    gap dimension, the mean GapScore with a bootstrap interval, the 80th
    percentile of Readiness with a bootstrap interval, and the share of units
    with Readiness >= 3.

    Args:
        units: Aggregated units (first return value of ``aggregate_units``).
        bootstrap_n: Number of bootstrap resamples per interval.
        seed: Seed passed to both bootstrap helpers.

    Returns:
        A DataFrame sorted by the "Requirement" label (string order).
    """

    def describe(rk, grp):
        gaps = grp["GapScore"].to_numpy()
        ready = grp["Readiness"].to_numpy()
        gap_lo, gap_hi = bootstrap_ci(gaps, n=bootstrap_n, seed=seed)
        ready_lo, ready_hi = percentile_ci(ready, q=0.8, n=bootstrap_n, seed=seed)
        record = {"Requirement": rk, "n_units": len(grp)}
        for dim in ("E", "M", "G", "K"):
            record[f"{dim}_mean"] = grp[dim].mean()
        record["GapScore_mean"] = gaps.mean()
        record["GapScore_CI_low"] = gap_lo
        record["GapScore_CI_high"] = gap_hi
        record["Readiness_p80"] = float(np.quantile(ready, 0.8))
        record["Readiness_CI_low"] = ready_lo
        record["Readiness_CI_high"] = ready_hi
        record["Readiness_share_ge3"] = float((ready >= 3).mean())
        return record

    summary = [describe(rk, grp) for rk, grp in units.groupby("requirement")]
    return pd.DataFrame(summary).sort_values("Requirement")

def reliability(pivot: pd.DataFrame):
    """Compute inter-coder reliability from the wide coder-level frame.

    For E, M, G, K and Readiness this computes the alpha coefficient and the
    ICC across coders A/B/C, plus quadratic-weighted kappa for each coder
    pair. Readiness uses the rating range 0..5, the other dimensions 1..5.
    For GapScore only the ICC is computed (alpha is reported as NaN).

    Returns:
        ``(rel_df, kappa_df)``: one row per dimension with alpha and ICC, and
        one row per (dimension, coder pair) with the weighted kappa.
    """
    dims = ("E", "M", "G", "K", "Readiness")
    coders = ("A", "B", "C")

    def lowest(dim):
        # Readiness is the only dimension whose scale starts at 0.
        return 0 if dim == "Readiness" else 1

    rel_rows = []
    for dim in dims:
        mat = pivot[[f"{dim}_{c}" for c in coders]].to_numpy(dtype=float)
        rel_rows.append(
            {
                "dimension": dim,
                "krippendorff_alpha_ordinal": krippendorff_alpha_ordinal(mat, min_rating=lowest(dim), max_rating=5),
                "icc_2k": icc_2k(mat),
            }
        )
    gap_mat = pivot[["GapScore_A", "GapScore_B", "GapScore_C"]].to_numpy(dtype=float)
    rel_rows.append({"dimension": "GapScore", "krippendorff_alpha_ordinal": np.nan, "icc_2k": icc_2k(gap_mat)})

    kappa_rows = [
        {
            "dimension": dim,
            "pair": f"{a}-{b}",
            "kappa_quadratic": weighted_kappa_quadratic(
                pivot[f"{dim}_{a}"], pivot[f"{dim}_{b}"], min_rating=lowest(dim), max_rating=5
            ),
        }
        for dim in dims
        for a, b in (("A", "B"), ("A", "C"), ("B", "C"))
    ]
    return pd.DataFrame(rel_rows), pd.DataFrame(kappa_rows)

def extract_external_sentences(pdf_path: str, max_pages: int = 80):
    """Extract sentence-like strings from the first pages of an external PDF.

    Each page's text is whitespace-normalized and processed line by line:
    lines shorter than 60 characters are ignored, the remaining lines are
    split after ".", "!" or "?" followed by whitespace, and only parts of at
    least 60 characters are kept.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF.
        max_pages: Upper bound on the number of pages read from the start.

    Returns:
        A list of ``(page_number, sentence)`` tuples; page numbers start at 1.
    """
    doc = fitz.open(pdf_path)
    try:
        out = []
        n = min(max_pages, doc.page_count)
        for i in range(n):
            t = doc.load_page(i).get_text("text") or ""
            t = re.sub(r"[ \t]+", " ", t)
            t = re.sub(r"\n{2,}", "\n", t)
            for line in t.split("\n"):
                line = line.strip()
                if len(line) < 60:
                    continue
                parts = re.split(r"(?<=[\.\!\?])\s+", line)
                for p in parts:
                    p = p.strip()
                    if len(p) >= 60:
                        out.append((i + 1, p))
    finally:
        # Release the document handle even if a page fails to load or parse.
        doc.close()
    return out

def external_proxy(external_pdfs, max_pages=80):
    """Build the per-sentence external signal table.

    Every sentence extracted from each external PDF is mapped to a
    requirement via ``map_to_rk``; unmapped sentences are skipped. A sentence
    gets ``signal = 1`` when it contains at least one word from
    ``SIGNAL_TERMS``, else 0. Only a hash and the length of the sentence are
    stored, not its text.

    Args:
        external_pdfs: Iterable of ``(name, path)`` pairs.
        max_pages: Page limit forwarded to ``extract_external_sentences``.

    Returns:
        A DataFrame with the columns doc, page, requirement, signal,
        text_sha256 and text_len_chars (no columns if nothing was mapped).
    """
    records = []
    for name, path in external_pdfs:
        for pno, sentence in extract_external_sentences(path, max_pages=max_pages):
            rk = map_to_rk(sentence)
            if rk is not None:
                tokens = set(re.findall(r"[A-Za-z]+", sentence.lower()))
                records.append(
                    {
                        "doc": name,
                        "page": pno,
                        "requirement": rk,
                        "signal": int(not tokens.isdisjoint(SIGNAL_TERMS)),
                        "text_sha256": hashlib.sha256(sentence.encode("utf-8")).hexdigest(),
                        "text_len_chars": len(sentence),
                    }
                )
    return pd.DataFrame(records)

def excerpt_words(s: str, n_words: int = 12) -> str:
    """Return the first *n_words* whitespace-separated words of *s*.

    Words are re-joined with single spaces; " …" is appended when the input
    had more than *n_words* words.
    """
    tokens = s.split()
    head = " ".join(tokens[:n_words])
    if len(tokens) > n_words:
        return head + " …"
    return head

def main():
    """Command-line entry point: score the main PDF and write all outputs.

    Steps: extract units from ``--unicef_pdf``, score them per coder,
    aggregate, summarize per requirement, compute reliability metrics, build
    the external proxy table from ``--external_pdfs``, and write the CSV/JSON
    files into ``--out_dir``. With ``--include_debug_excerpts`` a sample of
    up to 30 units with 12-word excerpts is written as well.

    Raises:
        SystemExit: If no scorable unit could be extracted from the main PDF.
    """
    import sys  # local import: only needed for stderr diagnostics

    ap = argparse.ArgumentParser()
    ap.add_argument("--unicef_pdf", required=True)
    ap.add_argument("--external_pdfs", nargs="*", default=[])
    ap.add_argument("--out_dir", required=True)
    ap.add_argument("--bootstrap_n", type=int, default=2000)
    ap.add_argument("--seed", type=int, default=7)
    ap.add_argument("--max_external_pages", type=int, default=80)
    ap.add_argument("--include_debug_excerpts", action="store_true")
    args = ap.parse_args()

    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    units_df, mode = build_units_from_unicef(args.unicef_pdf)
    if units_df.empty:
        # Without units the downstream pivot/group-by steps cannot run; stop
        # with a clear message instead of an opaque KeyError.
        raise SystemExit(f"No scorable units could be extracted from {args.unicef_pdf!r} (mode: {mode}).")
    coder_level = build_scores(units_df)
    units_agg, pivot = aggregate_units(coder_level)
    req_df = summarize_requirements(units_agg, bootstrap_n=args.bootstrap_n, seed=args.seed)
    rel_df, kappa_df = reliability(pivot)

    external_list = []
    for p in args.external_pdfs:
        p = Path(p)
        if p.exists():
            external_list.append((p.stem, str(p)))
        else:
            # Missing external files are still skipped (best effort), but the
            # user is told so the proxy is not silently incomplete.
            print(f"Warning: external PDF not found, skipping: {p}", file=sys.stderr)

    ext_columns = ["doc", "page", "requirement", "signal", "text_sha256", "text_len_chars"]
    ext_df = external_proxy(external_list, max_pages=args.max_external_pages) if external_list else pd.DataFrame(
        columns=ext_columns
    )
    if ext_df.empty:
        # Keep the header in external_proxy_signals.csv even with no rows.
        ext_df = pd.DataFrame(columns=ext_columns)
    if len(ext_df):
        ext_rate = ext_df.groupby("requirement")["signal"].mean().reset_index().rename(columns={"signal": "external_signal_rate"})
    else:
        ext_rate = pd.DataFrame({"requirement": [f"R{i}" for i in range(1, 11)], "external_signal_rate": [0.0] * 10})

    merged = req_df.merge(ext_rate, left_on="Requirement", right_on="requirement", how="left").drop(columns=["requirement"])
    merged["external_signal_rate"] = merged["external_signal_rate"].fillna(0.0)

    def _pearson_or_zero(x, y):
        # Pearson correlation is undefined when either series is constant
        # (or has fewer than two values); report 0.0 in that case.
        if x.std() > 0 and y.std() > 0:
            return float(np.corrcoef(x, y)[0, 1])
        return 0.0

    corr_ready = _pearson_or_zero(merged["Readiness_p80"], merged["external_signal_rate"])
    corr_gap = _pearson_or_zero(merged["GapScore_mean"], merged["external_signal_rate"])

    ext_summary = merged[["Requirement", "external_signal_rate", "GapScore_mean", "Readiness_p80", "Readiness_share_ge3"]].copy()
    ext_summary["pearson_corr_ready_vs_external"] = corr_ready
    ext_summary["pearson_corr_gap_vs_external"] = corr_gap

    audit_items = [
        ("Run timestamp (local)", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")),
        ("Extraction mode", mode),
        ("UNICEF PDF path", str(args.unicef_pdf)),
        ("UNICEF PDF SHA256", sha256_file(args.unicef_pdf)),
        ("Units extracted", str(len(units_df))),
        ("Bootstrap", f"{args.bootstrap_n} resamples; seed={args.seed}"),
        ("External PDFs", "; ".join([f"{n}:{p}" for n, p in external_list]) if external_list else ""),
    ]
    audit_df = pd.DataFrame(audit_items, columns=["Item", "Value"])

    run_config = {
        "bootstrap": {"resamples": args.bootstrap_n, "seed": args.seed, "ci": "95% percentile"},
        "aggregation_rule": "median across A/B/C; if max-min>=2 then conservative (max for gap dims, min for Readiness)",
        "scales": {"E/M/G/K": "1..5 (higher=worse gap)", "Readiness": "0..5 (higher=better implementability)"},
        "gapscore": "0.25*(E+M+G+K)",
        "extraction_mode": mode,
        "copyright_note": "By default, no verbatim sentences are exported; only hashes and anchors.",
    }

    coder_level.to_csv(out_dir / "units_coder_level.csv", index=False)
    units_agg.to_csv(out_dir / "units_aggregated.csv", index=False)
    req_df.to_csv(out_dir / "requirement_summary.csv", index=False)
    rel_df.to_csv(out_dir / "reliability_metrics.csv", index=False)
    kappa_df.to_csv(out_dir / "weighted_kappa_pairs.csv", index=False)
    ext_df.to_csv(out_dir / "external_proxy_signals.csv", index=False)
    ext_summary.to_csv(out_dir / "external_proxy_summary.csv", index=False)
    audit_df.to_csv(out_dir / "audit_log.csv", index=False)
    (out_dir / "run_config.json").write_text(json.dumps(run_config, ensure_ascii=False, indent=2), encoding="utf-8")

    if args.include_debug_excerpts:
        # Short excerpts only; the full unit text is never written out.
        sample = units_df.sample(n=min(30, len(units_df)), random_state=args.seed).copy()
        sample["excerpt_12w"] = sample["text"].apply(lambda x: excerpt_words(x, 12))
        sample = sample[["unit_id", "page", "requirement", "unit_rank", "text_sha256", "excerpt_12w"]]
        sample.to_csv(out_dir / "debug_excerpts_30.csv", index=False)

    print("Done. Outputs written to:", out_dir)

# Run the pipeline only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
